In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
from pathlib import Path
import os
In [3]:
import pandas as pd
import numpy as np
import sys
sys.path.append("../../")
DATA_PATH = Path('../../../data/')
MODEL_PATH = Path('../../../models/keras2lwtnn')
In [4]:
width  = 6.693
height = 9.466
import matplotlib as mpl

mpl.use('pdf')

%matplotlib inline

import matplotlib.pyplot as plt
In [5]:
plt.style.use('seaborn')
In [6]:
plt.rc('font', family='serif', serif='Computer Modern Roman')
plt.rc('text', usetex=True)
# plt.rc('xtick', labelsize=8)
# plt.rc('ytick', labelsize=8)
# plt.rc('axes', labelsize=8)
# plt.rc('figure', titlesize=10)
In [7]:
from sklearn.model_selection import train_test_split
In [8]:
k = 1
data = pd.read_hdf(DATA_PATH / 'train_data.h5', 'train_set').sample(frac=k, random_state=137)
In [9]:
unused_features = [
    'seed_nbIT', 'seed_nLayers', 'seed_mva_value', 'seed_nLHCbIDs',
    'is_downstream_reconstructible_not_electron', 'is_true_seed',
    'has_MCParticle_not_electron', 'has_MCParticle'
]

label_names = [
    'is_downstream_reconstructible'
]
In [10]:
data.head()
Out[10]:
has_MCParticle is_downstream_reconstructible has_MCParticle_not_electron is_downstream_reconstructible_not_electron is_true_seed seed_chi2PerDoF seed_p seed_pt seed_nLHCbIDs seed_nbIT seed_nLayers seed_x seed_y seed_tx seed_ty seed_mva_value
1230524 True True True True False 1.224318 24853.206220 4751.315834 22 0 12 1218.260381 57.654772 0.194639 0.007061 0.894110
416777 True False True False False 2.310915 14115.601536 1086.456444 12 12 12 174.162794 14.526168 0.077169 0.002084 0.008227
1438292 False False False False False 6.570617 140397.866037 32186.729339 10 0 9 1750.810558 366.384734 0.219562 0.085235 0.002448
2222976 True False False False False 0.769758 2739.445443 1166.308990 22 0 12 -1143.220300 80.898965 -0.470408 0.010258 0.341952
2014207 True True True True False 1.118085 2894.345575 1438.781493 24 0 12 -1823.896400 -236.162727 -0.571847 -0.034705 0.856861

Training a model for comparison

In [11]:
train_set, test_set = train_test_split(data, test_size=0.3, random_state=42)
In [12]:
x_train = train_set.drop(label_names + unused_features, axis=1)
y_train = train_set[label_names].copy().astype(np.int32)

x_test = test_set.drop(label_names + unused_features, axis=1)
y_test = test_set[label_names].copy().astype(np.int32)
In [13]:
def latex_rename(df):
    """Return a copy of ``df`` whose column labels have every underscore
    escaped (``_`` -> ``\\_``) so they render correctly when matplotlib is
    in LaTeX mode (``plt.rc('text', usetex=True)`` is set in this notebook).

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        New frame with escaped column names; ``df`` itself is unchanged.
    """
    # Use a raw string for the replacement: "\_" in a plain literal is an
    # invalid escape sequence (DeprecationWarning today, a syntax error in
    # future Python versions). r"\_" is the intended two-character text.
    mapping = {column: column.replace("_", r"\_") for column in df.columns}
    return df.rename(columns=mapping)
In [14]:
latex_rename(x_train).hist(bins=25, figsize=(width, height), layout=(5,2))
# plt.savefig("features_hist.pdf")
Out[14]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f3a40262b70>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f3a37f99cf8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f3a37eca128>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f3a37f5da20>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f3a37e08780>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f3a37e087b8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f3a37d61240>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f3a37cc51d0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f3a37c7eb00>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f3a37c0c780>]], dtype=object)
In [15]:
temp_df = train_set.drop(label_names + ['seed_mva_value','is_downstream_reconstructible_not_electron', 'is_true_seed',
    'has_MCParticle_not_electron', 'has_MCParticle'] , axis=1)

temp_df['log10_seed_p'] = np.log10(temp_df['seed_p'])
temp_df['log10_seed_pt'] = np.log10(temp_df['seed_pt'])
del temp_df['seed_pt']
del temp_df['seed_p']

latex_rename(temp_df).hist(bins=100, figsize=(width, height),layout=(5,2))
plt.tight_layout()
# plt.savefig("features_hist.pdf")

Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(random_state=42, n_jobs=3, max_depth=20, class_weight='balanced')
forest.fit(x_train, y_train.values.ravel())
Out[16]:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=20, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=3, oob_score=False, random_state=42,
            verbose=0, warm_start=False)
In [17]:
from sklearn.metrics import roc_auc_score
print("train roc:{:.5}".format(roc_auc_score(y_train, forest.predict_proba(x_train)[:,1])))
print("train acc: {:.5}".format(forest.score(x_train, y_train)))

print("test roc:{:.5}".format(roc_auc_score(y_test, forest.predict_proba(x_test)[:,1])))
print("test acc: {:.5}".format(forest.score(x_test, y_test)))
train roc:0.96388
train acc: 0.90903
test roc:0.91829
test acc: 0.85168

DNN

Load model

In [18]:
architecture_file = (MODEL_PATH / 'lwtnn_v1_13' / 'architecture.json').as_posix()
weights_file = (MODEL_PATH / 'lwtnn_v1_13' / 'weights.h5').as_posix()
In [19]:
with open(architecture_file) as json_file:
    json_string = json_file.readlines()[0]
In [20]:
from keras.models import model_from_json
Using TensorFlow backend.
/usr/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6
  return f(*args, **kwds)
In [21]:
model = model_from_json(json_string)
In [22]:
model.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_3 (InputLayer)         (None, 10)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 64)                704       
_________________________________________________________________
batch_normalization_7 (Batch (None, 64)                256       
_________________________________________________________________
dense_10 (Dense)             (None, 64)                4160      
_________________________________________________________________
batch_normalization_8 (Batch (None, 64)                256       
_________________________________________________________________
dense_11 (Dense)             (None, 64)                4160      
_________________________________________________________________
batch_normalization_9 (Batch (None, 64)                256       
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 65        
=================================================================
Total params: 9,857
Trainable params: 9,473
Non-trainable params: 384
_________________________________________________________________
In [23]:
model.load_weights(weights_file)

Load new data

In [24]:
new_data = pd.read_hdf(DATA_PATH / 'test_data.h5', 'test_set')
new_data = new_data.drop(unused_features, axis=1)
In [25]:
x = new_data.drop(label_names, axis=1)
y = new_data[label_names].copy().astype(np.int32)
In [26]:
latex_rename(x).hist(bins=50, figsize=(width, height), layout=(5,2))
Out[26]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f3a2797ef28>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f3a277f2c50>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f3a277ced30>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f3a27723b70>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f3a276f0ba8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f3a276f0b38>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f3a2765be48>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f3a275b99b0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f3a2757b710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f3a274f5cc0>]], dtype=object)
In [27]:
weights = np.ones_like(y.values)/float(len(y.values))
plt.figure(figsize=(width/1.75,width/2.5))
plt.hist(y.values, weights=weights)
plt.title(latex_rename(y).columns[0])
# plt.savefig("class_balance.pdf")
Out[27]:
<matplotlib.text.Text at 0x7f3a26d7feb8>
In [28]:
def nn_data_preprocessing(x):
    x = x.copy()
    x.loc[:, 'seed_angle'] = (np.arctan(x['seed_y'].values/x['seed_x'].values) + 0.00059458703227136336 ) * 1.3499114679506281
    x.loc[:, 'seed_pr'] = (np.arctanh(x['seed_pt'].values/x['seed_p'].values) -0.2373458146766905) * 5.4252485447962835
    x.loc[:, 'seed_r'] = (np.sqrt(x['seed_y'].values**2 + x['seed_x'].values**2) -712.28415274688871 ) * 0.0019285089504441804
    x.loc[:, 'seed_chi2PerDoF'] = (np.sqrt(x['seed_chi2PerDoF'].values) -1.2984056703378601 ) * 2.3229264617966523
    x.loc[:, 'seed_p'] = (np.log10(x['seed_p'].values) -3.8730802388444614) * 2.2146322148563526
    x.loc[:, 'seed_pt'] = (np.log10(x['seed_pt'].values) -3.0783963469455355 ) * 4.6109558947469971
    x.loc[:, 'seed_tx'] = (x['seed_tx'].values + 0.0024932173949750538) * 3.2600086889926385
    x.loc[:, 'seed_ty'] = (x['seed_ty'].values + 0.00060514053765673619) * 14.811235511812143
    x.loc[:, 'seed_x'] = (x['seed_x'].values -0.19295878936748806) * 0.0014091856183636067
    x.loc[:, 'seed_y'] = (x['seed_y'].values + 4.7701536104670419) * 0.0019151935939470761
    x = x[['seed_chi2PerDoF','seed_p','seed_pt','seed_x','seed_y','seed_tx','seed_ty','seed_angle','seed_pr','seed_r']].copy()
    return x
    
x_nn = nn_data_preprocessing(x)
In [29]:
temp_df = x_nn.copy()

temp_df['log10_seed_p'] = temp_df['seed_p']
temp_df['log10_seed_pt'] = temp_df['seed_pt']
del temp_df['seed_pt']
del temp_df['seed_p']

latex_rename(temp_df).hist(bins=100, figsize=(width,height), layout=(5,2))
plt.tight_layout()
# plt.savefig("scaled_features.pdf")
In [30]:
x_nn.mean()
Out[30]:
seed_chi2PerDoF    0.001308
seed_p            -0.000323
seed_pt           -0.000533
seed_x             0.001867
seed_y             0.001384
seed_tx            0.001904
seed_ty            0.001175
seed_angle         0.003129
seed_pr            0.000843
seed_r            -0.000911
dtype: float64
In [31]:
y.hist()
Out[31]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f3a1fda5128>]], dtype=object)
In [32]:
model.predict(x_nn.values[:5])
Out[32]:
array([[ 0.47595966],
       [ 0.96771079],
       [ 0.95603633],
       [ 0.03467309],
       [ 0.04738973]], dtype=float32)

Compare

In [33]:
from utils import plot_roc_curve, plot_confusion_matrix

from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score
In [34]:
print("accuracy - Random Forest")
accuracy_score(y.values, forest.predict(x.values))
accuracy - Random Forest
Out[34]:
0.85135116484487139
In [35]:
print("accuracy - DNN")
accuracy_score(y.values, np.array(model.predict(x_nn.values) > 0.5, dtype=np.int32))
accuracy - DNN
Out[35]:
0.87820194324831624
In [36]:
predictions_df = pd.DataFrame(model.predict(x_nn.values), columns=y.columns, index=y.index)
In [37]:
plot_confusion_matrix(
    confusion_matrix(y, forest.predict(x)),
    classes=["ghost","true"],
    title='Normalized confusion matrix - RandomForest',
    normalize=True
)
In [38]:
plot_confusion_matrix(
    confusion_matrix(y, np.array(predictions_df.values > 0.5, dtype=np.int32)),
    classes=["ghost","true"],
    title='Normalized confusion matrix - DNN',
    normalize=True
)
In [39]:
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y, forest.predict_proba(x)[:,1])
fpr_model, tpr_model, thresholds_model = roc_curve(y, predictions_df.values)

plt.figure(figsize=(8,8))
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plot_roc_curve(fpr_model, tpr_model, "DNN")
plt.legend(loc="lower right")
Out[39]:
<matplotlib.legend.Legend at 0x7f3a21124a58>
In [40]:
# with open('roc.csv', 'w') as f:
#     print('fpr,tpr', file=f)
#     for i, (x,y1) in enumerate(zip(fpr_model, tpr_model)):
#         if i%100 == 0:
#             print("{},{}".format(x,y1), file=f)
In [47]:
import plotly.offline as py
py.init_notebook_mode()
import plotly.graph_objs as go

def plot_true_positives_and_negatives(
        y_true,
        probabilities,
        normalize=False,
        step=0.1,
        title='True positives and true negatives vs threshold', ):
    """Sweep a decision threshold from 0 to 1 and plot (with plotly) how the
    true-positive and true-negative confusion-matrix entries evolve.

    With ``normalize=True`` each confusion-matrix row is divided by its sum,
    so the diagonal entries become per-class rates (TPR / TNR).

    Returns ``(true_positives_rate, true_negatives_rate, thresholds)`` so
    callers can export the curves.
    """
    thresholds = np.arange(0.0, 1.0, step)
    n_points = thresholds.shape[0]
    true_positives_rate = np.empty(n_points)
    true_negatives_rate = np.empty(n_points)

    for i, threshold in enumerate(thresholds):
        predicted = np.array(probabilities > threshold, dtype=int)
        cm = confusion_matrix(y_true, predicted)
        if normalize:
            # Row-normalise so the diagonal holds per-class rates.
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        true_positives_rate[i] = cm[1, 1]
        true_negatives_rate[i] = cm[0, 0]

    traces = [
        go.Scatter(x=thresholds, y=true_positives_rate, name='true positives'),
        go.Scatter(x=thresholds, y=true_negatives_rate, name='true negatives'),
    ]
    figure = go.Figure(
        data=traces,
        layout=go.Layout(title=title, xaxis=dict(title='threshold')),
    )

    py.iplot(figure)
    return true_positives_rate, true_negatives_rate, thresholds
In [54]:
plot_true_positives_and_negatives(
    y, forest.predict_proba(x)[:,1],
    title='Thresholds - RandomForest',
    step=1e-2,
    normalize=True
)
Out[54]:
(array([  9.99594344e-01,   9.98523722e-01,   9.97650792e-01,
          9.96860021e-01,   9.96005063e-01,   9.95114161e-01,
          9.94230962e-01,   9.93363167e-01,   9.92495372e-01,
          9.91545419e-01,   9.90223188e-01,   9.88826500e-01,
          9.87465757e-01,   9.86220548e-01,   9.84890614e-01,
          9.83468252e-01,   9.82066430e-01,   9.80695417e-01,
          9.79147250e-01,   9.77696647e-01,   9.75809578e-01,
          9.73866026e-01,   9.72017469e-01,   9.70125265e-01,
          9.68423053e-01,   9.66605305e-01,   9.64633511e-01,
          9.62651447e-01,   9.60643709e-01,   9.58592324e-01,
          9.56232846e-01,   9.53922150e-01,   9.51683342e-01,
          9.49398320e-01,   9.46892499e-01,   9.44312221e-01,
          9.41657488e-01,   9.39059239e-01,   9.36332617e-01,
          9.33441679e-01,   9.30175895e-01,   9.27187395e-01,
          9.24014039e-01,   9.20553129e-01,   9.17369503e-01,
          9.13826436e-01,   9.10332150e-01,   9.06660710e-01,
          9.02881438e-01,   8.98914743e-01,   8.94462802e-01,
          8.90241930e-01,   8.85972277e-01,   8.81610196e-01,
          8.77114608e-01,   8.72441867e-01,   8.67607377e-01,
          8.62667622e-01,   8.57275997e-01,   8.51838159e-01,
          8.46310460e-01,   8.40775059e-01,   8.34954672e-01,
          8.28664443e-01,   8.22340838e-01,   8.15542256e-01,
          8.08504903e-01,   8.01097837e-01,   7.92892298e-01,
          7.84209216e-01,   7.74609557e-01,   7.64750586e-01,
          7.54113681e-01,   7.42752758e-01,   7.30013120e-01,
          7.16433928e-01,   7.01247519e-01,   6.84435921e-01,
          6.66248174e-01,   6.45831889e-01,   6.22986806e-01,
          5.97869538e-01,   5.70482653e-01,   5.39760663e-01,
          5.06050173e-01,   4.69037954e-01,   4.29758686e-01,
          3.85467775e-01,   3.38599153e-01,   2.86308098e-01,
          2.31536895e-01,   1.77155944e-01,   1.23830210e-01,
          8.19475575e-02,   4.97236151e-02,   2.83727821e-02,
          1.05701515e-02,   2.45704031e-03,   4.39032280e-04,
          1.30939452e-04]),
 array([ 0.01648625,  0.24455407,  0.30318414,  0.33398693,  0.35683908,
         0.3796196 ,  0.39963351,  0.41244587,  0.4238615 ,  0.43346555,
         0.45139927,  0.47236523,  0.48786661,  0.50088788,  0.51264971,
         0.52310732,  0.53350822,  0.54369126,  0.55353407,  0.56313812,
         0.57628474,  0.58779587,  0.59789236,  0.60724572,  0.61623198,
         0.6247258 ,  0.63322858,  0.64160601,  0.65002223,  0.65793408,
         0.66741876,  0.67568576,  0.68346032,  0.6909633 ,  0.69807531,
         0.70517837,  0.7125172 ,  0.71957847,  0.72642486,  0.73305339,
         0.74093241,  0.74769524,  0.75393876,  0.76017035,  0.76630346,
         0.77215602,  0.77826524,  0.78387606,  0.78960626,  0.79529169,
         0.80123677,  0.80651332,  0.81198984,  0.8171888 ,  0.82228032,
         0.82720172,  0.83190228,  0.83657   ,  0.84132129,  0.84577413,
         0.85055526,  0.85511554,  0.85969075,  0.86396751,  0.86817562,
         0.87238672,  0.87657095,  0.88055225,  0.88456339,  0.88869689,
         0.89296768,  0.89716982,  0.90133615,  0.90546667,  0.90966881,
         0.91406495,  0.91827008,  0.92250804,  0.92691312,  0.93150624,
         0.93601877,  0.94082079,  0.94581084,  0.95082178,  0.95563872,
         0.96053924,  0.96528157,  0.97000003,  0.97480802,  0.97970854,
         0.98451951,  0.98881119,  0.99238958,  0.99537108,  0.99727816,
         0.99855253,  0.99946279,  0.99984182,  0.99995523,  0.99998508]),
 array([ 0.  ,  0.01,  0.02,  0.03,  0.04,  0.05,  0.06,  0.07,  0.08,
         0.09,  0.1 ,  0.11,  0.12,  0.13,  0.14,  0.15,  0.16,  0.17,
         0.18,  0.19,  0.2 ,  0.21,  0.22,  0.23,  0.24,  0.25,  0.26,
         0.27,  0.28,  0.29,  0.3 ,  0.31,  0.32,  0.33,  0.34,  0.35,
         0.36,  0.37,  0.38,  0.39,  0.4 ,  0.41,  0.42,  0.43,  0.44,
         0.45,  0.46,  0.47,  0.48,  0.49,  0.5 ,  0.51,  0.52,  0.53,
         0.54,  0.55,  0.56,  0.57,  0.58,  0.59,  0.6 ,  0.61,  0.62,
         0.63,  0.64,  0.65,  0.66,  0.67,  0.68,  0.69,  0.7 ,  0.71,
         0.72,  0.73,  0.74,  0.75,  0.76,  0.77,  0.78,  0.79,  0.8 ,
         0.81,  0.82,  0.83,  0.84,  0.85,  0.86,  0.87,  0.88,  0.89,
         0.9 ,  0.91,  0.92,  0.93,  0.94,  0.95,  0.96,  0.97,  0.98,  0.99]))
In [55]:
true_positives_rate, true_negatives_rate, thresholds = plot_true_positives_and_negatives(
    y,  predictions_df.values,
    title='Thresholds - DNN',
    step=1e-2,
    normalize=True
)
In [50]:
# with open('tpr_fpr.csv', 'w') as f:
#     print('threshold,tpr,fpr', file=f)
#     for x,y1,y2 in zip(thresholds, true_positives_rate, true_negatives_rate):
#         print("{},{},{}".format(x,y1,y2), file=f)
In [51]:
print("ROC score Random Forest")
roc_auc_score(y.values, forest.predict_proba(x)[:,1])
ROC score Random Forest
Out[51]:
0.91801318518483521
In [52]:
print("ROC score DNN")
roc_auc_score(y, predictions_df.values)
ROC score DNN
Out[52]:
0.94191462599596587
In [53]:
cm = confusion_matrix(y, predictions_df.values > 0.5)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
cm
In [ ]:
cm = confusion_matrix(y, predictions_df.values > 0.245)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
cm
In [ ]:
from sklearn.metrics import log_loss
In [ ]:
log_loss(y, predictions_df.values)
In [ ]:
log_loss(y, forest.predict_proba(x.values)[:,1])
In [ ]:
dnn_pred = pd.DataFrame(model.predict(nn_data_preprocessing(x_train).values), columns=y_train.columns, index=y_train.index) 
In [ ]:
dnn_pred['decision'] = dnn_pred['is_downstream_reconstructible'] > 0.5
In [ ]:
dnn_pred['correct'] = dnn_pred['decision'] == y_train['is_downstream_reconstructible']
In [ ]:
dnn_pred.head()
In [ ]:
def inv_expit(x):
    """Inverse of the logistic sigmoid (the logit function): maps a
    probability in (0, 1) back to the pre-activation value."""
    reciprocal_odds = 1.0 / x - 1.0
    return -np.log(reciprocal_odds)
In [ ]:
inv_expit(dnn_pred[dnn_pred['correct'] == False]['is_downstream_reconstructible']).hist(bins=100, figsize=(width*1.5,width), alpha=.75, normed=True)
In [ ]:
np.max(inv_expit(dnn_pred[dnn_pred['correct'] == False]['is_downstream_reconstructible']))
In [ ]:
np.max(inv_expit(dnn_pred[dnn_pred['correct'] == True]['is_downstream_reconstructible']))
In [ ]:
np.min(inv_expit(dnn_pred[dnn_pred['correct'] == True]['is_downstream_reconstructible']))
In [ ]:
inv_expit(dnn_pred[dnn_pred['correct'] == True]['is_downstream_reconstructible']).hist(bins=100, alpha=0.75, normed=True)
In [ ]:
# Overlay the inverse-sigmoid (pre-activation) score distributions of wrong
# vs. correct DNN decisions on the training set.
# NOTE(review): `normed=True` was deprecated in matplotlib 2.1 and removed
# in 3.1 -- switch to `density=True` when upgrading matplotlib.
# NOTE(review): the title says "Wrong decisions" but both distributions are
# drawn here -- consider a legend or a more accurate title.
inv_expit(dnn_pred[dnn_pred['correct'] == False]['is_downstream_reconstructible']).hist(bins=100, figsize=(width*1.5,width), alpha=.75, normed=True)
inv_expit(dnn_pred[dnn_pred['correct'] == True]['is_downstream_reconstructible']).hist(bins=100, alpha=0.75, normed=True)
plt.title("Wrong decisions")
# These rc tweaks are global: they also affect every figure drawn after
# this cell, not just this one.
plt.rc('text', usetex=True)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
plt.rc('axes', labelsize=10)
plt.rc('figure', titlesize=12)
plt.xlabel("$\hat{y}$")
plt.tight_layout()
# plt.savefig("wrong.pdf")
In [ ]:
dnn_pred[dnn_pred['correct'] == True]['is_downstream_reconstructible'].hist(bins=100, figsize=(width/2,width/2.5))
plt.title("Correct decisions")
plt.xlabel("$\hat{y}$")
plt.tight_layout()
# plt.savefig("correct.pdf")